// This code simulates outcome variables using baseline data in order to estimate ex ante minimum detectable effects.  

//Session settings: disable output paging and fix the random-number seed so the simulation is reproducible
set more off
set seed 999

clear

*To run this do-file on its own, insert your own filepath below and uncomment the line to navigate to the Replication directory.

*cd "...."

//Load the baseline analysis data (the global $data must already be defined)

use "$data/temp/temp_analysis", clear

//Restrict the sample to households that were found and that gave consent

keep if hh_found == "yes" & consent_given == "yes"

//Numeric village identifier, used for by-groups, absorbed effects and clustering
encode village_code, generate(village)

//Village-level share of primary water sources with an arsenic test result at or above 10, and at or above 50.
//NOTE(review): Stata treats a missing value as larger than any number, so a missing test result counts as ">= 10" / ">= 50" in the indicators below - confirm this is intended.

bysort village: egen mean_arsenic_test_result_ws1 = mean(arsenic_test_result_ws1 >= 10)

bysort village: egen mean_arsenic_test_result_ws1_who = mean(arsenic_test_result_ws1 >= 50)

//Baseline distance to drinking water source 1 is hh_distance_final_ws1 (presumably in meters - confirm against the codebook)

//Smallest baseline distance to source 1 observed within each village

bysort village: egen hh_distance_final_ws1_minim = min(hh_distance_final_ws1)

//Baseline storage practice: 1 if the household reports storing water

generate storage_b = water_storage == "yes"

//Baseline water source arsenic indicator: 1 if the source test result is at or above 10

generate ws_arsenic_test_result_b = arsenic_test_result_ws1 >= 10

//Baseline household arsenic indicator: 1 if the household field result is at or above 10

generate hh_arsenic_test_result_b = hh_arsenic_field_res >= 10

//Copy baseline household fecal contamination under a _b (baseline) name

generate hh_enterbac_fc_b = hh_enterbac_fc

//Copy baseline water source fecal contamination under a _b (baseline) name

generate ws_enterbac_fc_b = enterbac_fc_ws1

//Village mean of water source fecal contamination

bysort village: egen mean_ws_enterbac_fc = mean(enterbac_fc_ws1)

//Success indicator: 1 if success_rate is strictly positive (a missing success_rate also yields 1)

generate success_dummy = success_rate > 0

//average distance to the anchor: dist_hh_algoanchor dist_hh_deskanchor

//distance from true source, in villages where this data is available : final_distance_hh_constr

//Baseline source arsenic bands: exactly 0, strictly between 0 and 50, and from 50 to 500 inclusive.
//A missing test result falls in none of the three bands.

generate ars_ws_0 = arsenic_test_result_ws1 == 0
generate ars_ws_l = arsenic_test_result_ws1 > 0 & arsenic_test_result_ws1 < 50
generate ars_ws_h = inrange(arsenic_test_result_ws1, 50, 500)

//Distance bands of width 80 (band k covers [80*(k-1), 80*k), k = 1..10; 0 means outside all bands):
//  anc_dist_cat  from the distance to the desk anchor
//  true_dist_cat from the distance to the constructed source

local src_anc dist_hh_deskanchor
local src_true final_distance_hh_constr

foreach stem in anc true {
	generate `stem'_dist_cat = 0
	forvalues band = 1/10 {
		replace `stem'_dist_cat = `band' if `src_`stem''/80 >= `band'-1 & `src_`stem''/80 < `band'
	}
}

//Build the instruments

//Numeric indicators for the stated willingness-to-switch answers at 1, 2, 5, 7 and 10 minutes

foreach mins of numlist 1 2 5 7 10 {
	generate switch`mins' = switch_`mins'min == "yes"
}

//Mean stated switch rate within each baseline arsenic band, stored for every one-unit distance band 1..10.
//Bands without their own question reuse the next question asked: 3-5 use the 5-minute answer,
//6-7 the 7-minute answer and 8-10 the 10-minute answer.

foreach cat in 0 l h {
	local first_band = 1
	foreach mins of numlist 1 2 5 7 10 {
		summarize switch`mins' if ars_ws_`cat' == 1
		local band_rate = r(mean)
		forvalues d = `first_band'/`mins' {
			local switch_rate_`d'_ars`cat' `band_rate'
		}
		local first_band = `mins' + 1
	}
}

//Assign each household the switch rate for its arsenic band and its desk-anchor distance band (d-1, d], in units of 80.
//Households outside every band keep a switch rate of 0.

generate switchrate = 0

foreach cat in 0 l h {
	forvalues d = 1/10 {
		replace switchrate = `switch_rate_`d'_ars`cat'' if ars_ws_`cat' == 1 & dist_hh_deskanchor/80 > `d'- 1 & dist_hh_deskanchor/80 <= `d'
	}
}

//Instruments and control variables:
//predicted change = switch rate x (value at the anchor - baseline value); the instrument additionally interacts with treated

generate pred_FC_diff_control = switchrate * (0 - ws_enterbac_fc_b)
generate pred_FC_diff_instrument = switchrate * (0 - ws_enterbac_fc_b) * treated

generate pred_dist_diff_control = switchrate * (dist_hh_deskanchor - hh_distance_final_ws1)
generate pred_dist_diff_instrument = switchrate * (dist_hh_deskanchor - hh_distance_final_ws1) * treated

//create matrix for simulation results: one row per simulated draw, 38 columns
//  cols  1-18 : regression of each of the six outcome differences on treated (coefficient, se, p-value)
//  cols 19-27 : OLS of household bacteria change on the three mechanism changes (coefficient, se, p-value)
//  cols 28-29 : two entries from row 8 of the ivreg2 first-stage matrix e(first)
//  cols 30-38 : IV estimates for the three mechanism changes (coefficient, se, p-value)

//number of simulation draws (also the number of rows of the results matrix)
local nsims = 500

mat results = J(`nsims',38,.)

//path is quoted so that a $data directory containing spaces still works
save "$data/temp/temp", replace

//Simulate data `nsims' times

forvalues i = 1(1)`nsims' {

	use "$data/temp/temp", clear

	//***********************Simulate counterfactual 

	//Assume roughly one third (probability 0.33) of households change source in 2.5 years in any case.

	gen switch = (runiform()<0.33)

	//simulate follow-up arsenic status (under counterfactual)

	//assume that the arsenic status of the new source is a draw from the distribution of sources in the village (builds in MORE noise than will actually happen). 

	gen ws_arsenic_test_result_f = (1-switch)* ws_arsenic_test_result_b + switch * (runiform()< mean_arsenic_test_result_ws1) 

	//For those who switch, assume that their new distance to drinking water is a draw from a uniform distribution between the minimum distance in their village and 50% more than their original distance.
	//The village minimum is the variable hh_distance_final_ws1_minim generated earlier in this file (the previous reference to ..._mins named a variable that is never created).

	gen dist_w_f = (1-switch)* hh_distance_final_ws1 + switch * (hh_distance_final_ws1_minim + runiform()* ((hh_distance_final_ws1 *1.5) - hh_distance_final_ws1_minim)) 

	//For those who switch at random, assume the bacterial status of their new source is a draw from the distribution in their village

	gen ws_enterbac_fc_f = (1-switch)* enterbac_fc_ws1 + switch * (runiform()< mean_ws_enterbac_fc) 

	//Simulate a pattern of changing to adopt our own source

	//Assume that 1 minute is 80m and convert the distance to the constructed source into minutes

	gen truediff_min = final_distance_hh_constr/80

	//A household adopts our source if it said it would switch to a source within the time band its true distance falls in

	gen ourswitch = 0 

	replace ourswitch = 1 if switch_1min == "yes" & truediff_min < 1 
	replace ourswitch = 1 if switch_2min == "yes" & truediff_min >= 1 & truediff_min < 2
	replace ourswitch = 1 if switch_5min == "yes" & truediff_min >= 2 & truediff_min < 5
	replace ourswitch = 1 if switch_7min == "yes" & truediff_min >= 5 & truediff_min < 7
	replace ourswitch = 1 if switch_10min == "yes" & truediff_min >= 7 & truediff_min < 10
	replace ourswitch = 1 if switch_12min == "yes" & truediff_min >= 10 & truediff_min < 12

	//estimate followup water source contamination: adopters of a successful source whose baseline source was contaminated (bacteria or arsenic) become arsenic-free

	replace ws_arsenic_test_result_f = 0 if ourswitch == 1 & success_dummy == 1 & (ws_enterbac_fc_b == 1 | ws_arsenic_test_result_b == 1)

	//simulate hh_arsenic_test_result_f from the baseline relationship between household and source arsenic (village effects absorbed)

	areg hh_arsenic_test_result_b ws_arsenic_test_result_b, absorb(village)

	local corr = _b[ws_arsenic_test_result_b]
	local int = _b[_cons]

	predict hh_ars_resid, resid
	summarize hh_ars_resid
	local resid_sd = r(sd)

	//linear index plus normal noise, then converted to a 0/1 outcome by comparing with a uniform draw

	gen hh_arsenic_test_result_f = `int' + `corr'*ws_arsenic_test_result_f + `resid_sd' * rnormal() 
	replace hh_arsenic_test_result_f = (runiform()<= hh_arsenic_test_result_f)

	//simulate follow-up water source bacterial contamination

	replace ws_enterbac_fc_f = 0 if ourswitch == 1 & success_dummy == 1  & (ws_enterbac_fc_b == 1 | ws_arsenic_test_result_b == 1)

	//simulate follow-up distance to safe water

	replace dist_w_f = final_distance_hh_constr if ourswitch == 1 & success_dummy == 1 & (ws_enterbac_fc_b == 1 | ws_arsenic_test_result_b == 1)
	
	//simulate storage practices at followup

	gen storage_b_obs = (source_test_stored == 1)

	//Take correlation between storage measures as a measure of correlation in storage practices over time (not ideal but best available strategy) 

	areg storage_b storage_b_obs hh_distance_final_ws1, absorb(village)

	local distcorr = _b[hh_distance_final_ws1]
	local selfcorr = _b[storage_b_obs]
	local int = _b[_cons]

	predict storage_resid, resid
	sum storage_resid
	local resid_sd = r(sd)
	
	//Predict storage behaviour at follow-up (left as a continuous index; it is not converted to 0/1)

	gen storage_f = `int' + `selfcorr'*storage_b + `distcorr' *dist_w_f + `resid_sd' * rnormal() 

	//somewhat heroically, predict household water contamination at followup

	//a single baseline regression supplies both the linear prediction and the residuals
	areg hh_enterbac_fc_b ws_enterbac_fc_b hh_distance_final_ws1 storage_b, absorb(village)

	predict hh_bac_pred, xb
	predict residuals, resid

	//intercept: mean baseline prediction net of the assumed effects (0.3, 0.0002, 0.05)
	//NOTE(review): the other two terms use baseline values but this one uses storage_f; storage_b may have been intended - confirm before changing, as it alters the simulated outcomes.
	gen hh_bac_pred_adj = hh_bac_pred - 0.3 * ws_enterbac_fc_b - 0.0002 * hh_distance_final_ws1 - 0.05 * storage_f
	sum hh_bac_pred_adj
	local int = r(mean)

	sum residuals

	local resid_sd = r(sd)

	//linear index plus normal noise, then converted to a 0/1 outcome by comparing with a uniform draw.
	//The variable is created under its final name so the replace below does not depend on variable-name abbreviation.

	gen hh_enterbac_fc_f = `int' + 0.3 * ws_enterbac_fc_f + 0.0002 * dist_w_f + 0.05* storage_f + `resid_sd' * rnormal() 

	replace hh_enterbac_fc_f = (runiform()<= hh_enterbac_fc_f)

	//generate outcomes 1a and 1b

	gen hh_arsenic_test_result_diff = hh_arsenic_test_result_f - hh_arsenic_test_result_b

	gen hh_enterbac_fc_diff = hh_enterbac_fc_f - hh_enterbac_fc_b

	//generate outcomes 2a, 2b, 2c and 2d

	gen ws_arsenic_test_result_diff = ws_arsenic_test_result_f - ws_arsenic_test_result_b

	gen ws_enterbac_fc_diff = ws_enterbac_fc_f - ws_enterbac_fc_b

	gen dist_w_diff = dist_w_f - hh_distance_final_ws1

	gen storage_diff = storage_f - storage_b
	
	//j indexes the current group of three result columns
	local j = 0 
	
	//effect of treated on each outcome difference, standard errors clustered by village
	foreach var in hh_arsenic_test_result_diff hh_enterbac_fc_diff ws_arsenic_test_result_diff ws_enterbac_fc_diff dist_w_diff storage_diff {
		quiet reg `var' treated, vce(cluster village)
		
		quiet mat results[`i',`j'*3 + 1] = _b[treated]
		quiet mat results[`i',`j'*3 + 2] = _se[treated]
		quiet test treated	
		quiet mat results[`i',`j'*3 + 3]  = r(p)
	
		local j = `j' + 1
	
	}	

	//run DinD: OLS of the household bacteria change on the three mechanism changes

	quiet reg hh_enterbac_fc_diff ws_enterbac_fc_diff dist_w_diff storage_diff, vce(cluster village)
	
	foreach var in ws_enterbac_fc_diff dist_w_diff storage_diff {
		
		quiet mat results[`i',`j'*3 + 1] = _b[`var']
		quiet mat results[`i',`j'*3 + 2] = _se[`var']
		quiet test `var'	
		quiet mat results[`i',`j'*3 + 3]  = r(p)
	
		local j = `j' + 1
	
	}
	
	//run IV		

	quiet ivreg2 hh_enterbac_fc_diff  storage_diff pred_FC_diff_control pred_dist_diff_control (ws_enterbac_fc_diff dist_w_diff = pred_FC_diff_instrument pred_dist_diff_instrument), cluster(village) first
	
	//first-stage statistics, one column per endogenous regressor
	//NOTE(review): which statistic sits in row 8 of e(first) depends on the installed ivreg2 version - confirm
	matrix A = e(first)
	quiet mat results[`i',`j'*3 + 1] = A[8,1]
	quiet mat results[`i',`j'*3 + 2] = A[8,2]
	
	local j = `j' + 1

	//IV estimates; the column offset is one lower here (j*3 .. j*3+2) so these columns follow directly after the two first-stage columns
	foreach var in ws_enterbac_fc_diff dist_w_diff storage_diff {
		
		quiet mat results[`i',`j'*3 ] = _b[`var']
		quiet mat results[`i',`j'*3 + 1] = _se[`var']
		quiet test `var'	
		quiet mat results[`i',`j'*3 + 2]  = r(p)
	
		local j = `j' + 1
	
	}

}

//Add the columns of the results matrix to the data in memory as variables results1, results2, ...
svmat results

//Calculate minimum detectable effects: 2.8 times the standard deviation, across simulations, of the estimated coefficient on treated.
//Columns, in order:
//   1 = 1a) arsenic in household water
//   4 = 1b) bacteria in household water
//   7 = 2a) source arsenic
//  10 = 2b) source bacteria
//  13 = 2c) distance to source
//  16 = 2d) storage practice

foreach col of numlist 1 4 7 10 13 16 {
	summarize results`col'
	display 2.8 * r(sd)
}

//Summarize first stage results

foreach col of numlist 28 29 {
	summarize results`col'
}
summarize results29



//the end